* Session setup: switch off output paging, wipe memory, raise the variable limit
set more off 
clear all 
clear matrix
set maxvar 15000 

    * Load the trends extract; every later merge attaches to this master file
    use "$SupplementaryData/trends_extract.dta", clear
	
/*
Note: "trends_extract.dta" was obtained directly from Jonathan Davis and Bhash Mazumder. 
It is the data file underlying Figure 3 of their paper "The Decline in Intergenerational Mobility After 1980."

To obtain a version of this data extract, see: https://direct.mit.edu/rest/article-abstract/doi/10.1162/rest_a_01413/119490/The-Decline-in-Intergenerational-Mobility-after 
Then place the corresponding file in the analysis/supplementary data folder.
*/ 

*--------------------------------------------------------*
/*  Attach retrospective parental occupation, grew-up-in-
    the-South, and race information for NLSYM, NLSYW,
    and NLSY79 respondents, one survey file at a time.   */
*--------------------------------------------------------*

    * For each merge key: the benchmarking file to attach ...
    local src_id_son      "$Mydirectory1/1_DataSources/NLSYM66/wrkdata/MD_benchmarkingexercise_nlsym66.dta"
    local src_id_daughter "$Mydirectory1/1_DataSources/NLSYW68/wrkdata/MD_benchmarkingexercise_nlsyw68.dta"
    local src_CASEID      "$Mydirectory1/1_DataSources/NLSY79/wrkdata/MD_benchmarkingexercise_nlsy79.dta"

    * ... and the dob_ variable that is discarded right after the merge
    local dob_id_son      dob_nlsym66
    local dob_id_daughter dob_nlsyw68
    local dob_CASEID      dob_nlsy79

    foreach key in id_son id_daughter CASEID {
        sort `key'
        merge m:1 `key' using "`src_`key''"

        * Rows without a usable id must never match a survey record
        * (for id_son, negative ids are treated as unusable as well)
        if "`key'"=="id_son" {
            assert _merge!=3 if id_son==. | id_son<0 
        }
        else {
            assert _merge!=3 if `key'==. 
        }

        * Discard survey-only rows, then the merge bookkeeping columns
        drop if _merge==2 
        drop _merge `dob_`key''
    }
  
*--------------------------------------------------------*
/*  Collapse the survey-specific columns into a single
    retrospective variable each for: father occupation,
    mother occupation, father not working, and whether
    R grew up in the South.                              */
*--------------------------------------------------------*

    local retro_list fatheroccej motheroccej father_notworking south_merge

    * Pass 1: build and tabulate each <name>_retro variable
    foreach v of local retro_list {

        * Source columns share the target's stem, except south_merge,
        * which is filled from the grewup_south columns
        local stem "`v'"
        if "`v'"=="south_merge" local stem "grewup_south"

        gen `v'_retro =.
        replace `v'_retro = `stem'_nlsy79 if CASEID!=.
        replace `v'_retro = `stem'_nlsym66 if !inlist(id_son,-4,.) & women!=1
        replace `v'_retro = `stem'_nlsyw68 if id_daughter!=. & women==1

        tab `v'_retro, m
    }

    * Pass 2: a retro value may only exist where at least one source column has one
    foreach v of local retro_list {
        local stem "`v'"
        if "`v'"=="south_merge" local stem "grewup_south"

        assert `v'_retro==. if `stem'_nlsy79==. & `stem'_nlsym66==. & `stem'_nlsyw68==.
    }

*-------------------------------------------*
* Create retrospective race variable 
*-------------------------------------------*

    * NLSYM66: copy the survey race variable and blank out code 3 (other)
    clonevar race_father_retro_nlsym66 = race_son
    replace race_father_retro_nlsym66 =. if race_father_retro_nlsym66==3 //other

    * NLSYW68: same treatment
    clonevar race_father_retro_nlsyw68 = race_daughter
    replace race_father_retro_nlsyw68 =. if race_father_retro_nlsyw68==3 //other

    * NLSY79: code 1 becomes a provisional 0, code 3 becomes 1, other codes are kept
    recode race (1=0) (3=1), gen(race_father_retro_nlsy79) 
    * Provisional 0s: set to missing when selfr_race_nlsy79 is one of the listed codes,
    * otherwise folded into category 1
    replace race_father_retro_nlsy79 =. if inlist(selfr_race_nlsy79,2,4,8,9,10,13,14,26) & race_father_retro_nlsy79==0 
    replace race_father_retro_nlsy79 =1 if race_father_retro_nlsy79==0       

    * Single race variable across the three surveys (NLSY79 is applied last)
    gen race_retro =.
    replace race_retro = race_father_retro_nlsym66 if !inlist(id_son,-4,.) & women!=1
    replace race_retro = race_father_retro_nlsyw68 if id_daughter!=. & women==1
    replace race_retro = race_father_retro_nlsy79 if CASEID!=.
    tab race_retro, m //1 (white) & 2 (black)  

    * Survey-specific inputs are no longer needed; keep the original NLSY79
    * race variable under a survey-specific name
    drop *_nlsy79 *_nlsym66 *_nlsyw68
    rename race race_nlsy79
    
*----------------------------------------------------------------------------*
*----------------------------------------------------------------------------*

*--------------------------------------------------*
*  Merge in Census-based income scores (fathers) 
*--------------------------------------------------*

    /* Preliminary step: give all respondents with father_notworking_retro = 1 
                         an occ code of "99"---allows Census non-working 
                         father inc scores to be merged.

       The variables are referenced by their full names (fatheroccej_retro,
       father_notworking_retro). At this point in the file no variable is
       literally called fatheroccej or father_notworking, so the short names
       only resolve through Stata's variable-name abbreviation; that fails
       under -set varabbrev off- or once another variable shares the prefix. */
    assert fatheroccej_retro==. if father_notworking_retro==1 
    replace fatheroccej_retro =99 if father_notworking_retro==1
    tab father_notworking_retro fatheroccej_retro if father_notworking_retro==1, m 
    tab fatheroccej_retro, m
    
* 1. Occupation x race x south 
    local cellkeys fatheroccej_retro race_retro south_merge_retro
    tempfile census_incscores

    * Stage a copy of the Census score file whose key names match the master data
    preserve
        use "$CensusData/output/IncomeScores_Coarsened_byrace_bysouth.dta", clear

        foreach key of varlist fatheroccej race south_merge {
            rename `key' `key'_retro
        }

        sort `cellkeys'
        save `census_incscores'
    restore

    sort `cellkeys'
    merge m:1 `cellkeys' using `census_incscores'

    * A master row may go unmatched only when at least one key is missing
    assert !(fatheroccej_retro!=. & race_retro!=. & south_merge_retro!=.) if _merge==1 

    * Discard score cells that no respondent falls into, then the columns not used below
    keep if _merge!=2
    drop _merge *altwgt flag* *CWfix avgincwage* *_20* avg_inctot_1960_byocc_byr_bys avg_inctot_1970_byocc_byr_bys avg_inctot_1980_byocc_byr_bys avg_inctot_1990_byocc_byr_bys
  
*----------------------------------------------------------------------------*
*----------------------------------------------------------------------------*

*----------------------------------*
* SUPPLEMENTAL 1936 INCOME SCORES
*----------------------------------*

    tempfile census_1936incscores

    * Stage the 1936 score file with key names that match the master data
    preserve
        use "$Survey1936/output/ConsumptionSurvey_1936_IncomeScores.dta", clear

        foreach key of varlist fatheroccej race south_merge {
            rename `key' `key'_retro
        }

        sort fatheroccej_retro race_retro south_merge_retro
        save `census_1936incscores'
    restore

    merge m:1 fatheroccej_retro race_retro south_merge_retro using `census_1936incscores'

    * Unmatched master rows must have a missing key or the non-working code 99
    assert !(!inlist(fatheroccej_retro,.,99) & race_retro!=. & south_merge_retro!=.) if _merge==1 

    keep if _merge!=2
    drop _merge number_obs_cell_1936 flag_impute_1936 avg_totfaminc_1936_altwt 

    * From here on the merge keys go by their short names
    foreach key in fatheroccej race south_merge {
        rename `key'_retro `key'
    }

*----------------------------------------------------------------------------*
*----------------------------------------------------------------------------*

*----------------------------------------------------------------*
* PUT ALL RELEVANT FATHER INCOME SCORES IN 2015$ BEFORE LOGGING
*----------------------------------------------------------------*

    /* CPI levels used for the conversion to 2015 dollars: 
       https://data.bls.gov/timeseries/CUUR0000SA0 
       NOTE(review): every score, whatever its year, is scaled from the 1950
       CPI level --- presumably the input files are already expressed in 1950
       dollars; confirm against the files that produce them. */ 
    gen CPI2015 = 237.017
    gen CPI1950 = 24.1 

    * Rescale each remaining father income score in place
    foreach score of varlist avg_HHinc_* avg_inctot* avg_totfaminc_*  {
        replace `score' = `score' * (CPI2015/CPI1950)
    }

    * The helper columns have served their purpose
    drop CPI*

*----------------------------------------------------------------------------*
*----------------------------------------------------------------------------*
    
*---------------------------------------------------*
* CREATE LOGGED HH INCOME SCORES   
*---------------------------------------------------*

    // Mix 1940 Census with 1936 survey---occ x race x south:
    // occupation codes 21 and 81 take the 1936 survey score, all others the 1940 Census score
    gen father_HHinc_1936fix = cond(inlist(fatheroccej,21,81), avg_totfaminc_1936, avg_HHinc_1940_byrace_bysouth)
    label var father_HHinc_1936fix "Father baseline income score, 1936 farm and self-emp"
    
    gen log_father_HHinc_1936fix = ln(father_HHinc_1936fix)
    label var log_father_HHinc_1936fix "Logged father's baseline HH income, 1936 farm and self-emp. fix"   
    
    // 1960-1990: one logged score per Census year
    foreach yr of numlist 1960(10)1990 {
        gen log_father_`yr'_byo_byr_bys = ln(avg_HHinc_`yr'_byocc_byr_bys)
        label var log_father_`yr'_byo_byr_bys "Logged father's household income, `yr', by occ x race by south"
    }
    
*----------------------------------------------------------------------------*
*----------------------------------------------------------------------------*

*-------------------------------------------------*
/* Create blended measure of income for 
  (1) working fathers
  (2) working + non-working fathers */
*-------------------------------------------------*
  
    * Calendar year in which the respondent turned 10
    gen age10 = byear+10
    tab byear 

    * Working fathers (occupation code below 99) with race and south both known
    gen baseline_sample = fatheroccej<99 & race!=. & south_merge!=.

    gen log_father_interpolated_retro =.

    * The 1936-adjusted score serves as the 1940 endpoint
    clonevar log_father_1940_byo_byr_bys = log_father_HHinc_1936fix 

    /* Respondents turning 10 in 1941-1960 (birth years 1931-1950):
       weighted average of the 1940 and 1960 scores, with weights
       proportional to closeness over the 20-year gap */
    forval yr=1941/1960 {
        local from40 = `yr'-1940   // years elapsed since 1940
        local to60   = 20-`from40' // years remaining until 1960

        replace log_father_interpolated_retro = ((`to60'/20)*log_father_1940_byo_byr_bys) + ((`from40'/20)*log_father_1960_byo_byr_bys) if age10==`yr'
    }

    /* Respondents turning 10 in 1961-1980 (birth years 1951-1970):
       weighted average of the two Census years that bracket the
       year the respondent turned 10 */
    forval yr=1961/1980 {
        local lo = 1960 + 10*floor((`yr'-1961)/10) // earlier Census year: 1960 or 1970
        local hi = `lo'+10                         // later Census year
        local fromlo = `yr'-`lo'                   // years elapsed since the earlier Census
        local tohi   = 10-`fromlo'                 // years remaining until the later Census

        replace log_father_interpolated_retro = ((`tohi'/10)*log_father_`lo'_byo_byr_bys) + ((`fromlo'/10)*log_father_`hi'_byo_byr_bys) if age10==`yr'
    }

    assert log_father_interpolated_retro!=. if log_father_HHinc_1936fix!=.
    label var log_father_interpolated_retro "Logged father's income (retrospective), interpolated for each decade (in 2015$)"
    drop log_father_1940_byo_byr_bys

    assert log_father_interpolated_retro!=. if baseline_sample==1

    * Keep a copy that still includes non-working fathers (code 99)
    assert log_father_interpolated_retro!=. if fatheroccej==99
    clonevar log_father_plusnotworking = log_father_interpolated_retro

    /* SUPER IMPORTANT: the baseline blended measure excludes
                        non-working dads from this point on */
    replace log_father_interpolated_retro =. if father_notworking_retro==1
    assert log_father_interpolated_retro!=. if baseline_sample==1       

    * Raw (unlogged) father scores are no longer needed
    drop avg_HHinc* avg_totfaminc* avg_inctot* *1936fix*
    
*----------------------------------------------------------------------------*
*----------------------------------------------------------------------------*
    
*------------------------------------------------------------*
*  Merge in Census-based income scores (working mothers)  
*------------------------------------------------------------*
    
    * The mothers' score file is keyed on the short variable name
    rename motheroccej_retro motheroccej
    label var motheroccej "Mother's coarsened occupation"
    
* 1. Occupation x race x south
    merge m:1 motheroccej race south_merge using "$CensusData/output/IncomeScores_Coarsened_byrace_bysouth_moms.dta"

    * A master row may go unmatched only when at least one key is missing
    assert !(motheroccej!=. & race!=. & south_merge!=.) if _merge==1 

    keep if _merge!=2
    drop _merge mom_avg_inctot_1950_byocc_rs

*----------------------------------------------------------------*
* PUT ALL RELEVANT MOTHER INCOME SCORES IN 2015$ BEFORE LOGGING
*----------------------------------------------------------------*

    /* CPI levels used for the conversion to 2015 dollars: 
       https://data.bls.gov/timeseries/CUUR0000SA0 
       NOTE(review): every score is scaled from the 1950 CPI level ---
       presumably the input file is already in 1950 dollars; confirm. */ 
    gen CPI2015 = 237.017
    gen CPI1950 = 24.1 

    * Rescale each mother income score in place
    foreach score of varlist mom_HHinc* mom_avg* {
        replace `score' = `score' * (CPI2015/CPI1950)
    }

    drop CPI*
    
*------------------------------------------*
* LOG VARIOUS MOTHER INCOME SCORES
*------------------------------------------*
    
* BASELINE (Occupation x race x south)

    // 1940
    gen log_mother_1940_byo_byr_bys  = ln(mom_HHinc_byr_bys_CWfix)
    label var log_mother_1940_byo_byr_bys "Logged mother's household income, 1940, by occ x race x south, CW fix"

    // 1960-1990: one logged score per Census year
    foreach yr of numlist 1960(10)1990 {
        gen log_mother_`yr'_byo_byr_bys = ln(mom_avg_HHinc_`yr'_byocc_rs)
        label var log_mother_`yr'_byo_byr_bys "Logged mother's household income, `yr', by occ x race x south"
    }

*------------------------------------------*
* BLENDED MOTHER INCOME SCORES
*------------------------------------------*
    
    * Respondents whose mother's occupation, race, and south are all known
    gen mothers_samp = motheroccej!=. & race!=. & south_merge!=.
    
    gen log_mother_interpolated_retro =.
    
    /* Respondents turning 10 in 1941-1960 (birth years 1931-1950):
       weighted average of the 1940 and 1960 scores, with weights
       proportional to closeness over the 20-year gap */
    forval yr=1941/1960 {
        local from40 = `yr'-1940   // years elapsed since 1940
        local to60   = 20-`from40' // years remaining until 1960
        
        replace log_mother_interpolated_retro = ((`to60'/20)*log_mother_1940_byo_byr_bys) + ((`from40'/20)*log_mother_1960_byo_byr_bys) if age10==`yr'
    }

    /* Respondents turning 10 in 1961-1980 (birth years 1951-1970):
       weighted average of the two Census years that bracket the
       year the respondent turned 10 */
    forval yr=1961/1980 {
        local lo = 1960 + 10*floor((`yr'-1961)/10) // earlier Census year: 1960 or 1970
        local hi = `lo'+10                         // later Census year
        local fromlo = `yr'-`lo'                   // years elapsed since the earlier Census
        local tohi   = 10-`fromlo'                 // years remaining until the later Census
        
        replace log_mother_interpolated_retro = ((`tohi'/10)*log_mother_`lo'_byo_byr_bys) + ((`fromlo'/10)*log_mother_`hi'_byo_byr_bys) if age10==`yr'
    }
    
    label var log_mother_interpolated_retro "Logged mother's HH inc.(race x south), interp. 1940-1990, CW fix"
    assert log_mother_interpolated_retro!=. if mothers_samp==1
    
    * Raw (unlogged) mother scores are no longer needed
    drop mom_* 
    
*----------------------------------------------------------------------------*
*----------------------------------------------------------------------------*    

*----------------------------------------------------------------*
* Create logged, interpolated measure for all dads, no moms
*----------------------------------------------------------------*
    
    * Working-father measure, with the non-working-father copy filling in
    * where the former is missing and the father is flagged as not working
    gen all_dads_income_retro = cond(log_father_interpolated_retro==. & father_notworking_retro==1, log_father_plusnotworking, log_father_interpolated_retro)
    label var all_dads_income_retro "Baseline (logged) income for fathers (retro, interp), working and non-working"
    
    assert all_dads_income_retro!=. if (fatheroccej!=.) & race!=. & south_merge!=.  

*----------------------------------------------------------------*
/* Create logged, interpolated measure for all parents 
   (2 types of dads and mom) */
*----------------------------------------------------------------*

    * Priority order: working father, then mother, then non-working father
    gen parent_income_all_retro = cond(log_father_interpolated_retro!=., log_father_interpolated_retro, cond(log_mother_interpolated_retro!=., log_mother_interpolated_retro, log_father_plusnotworking))
    label var parent_income_all_retro "Baseline (logged) income (retro, interp) for fathers, unemp fathers, and mothers"
    
    assert parent_income_all_retro!=. if (fatheroccej!=. | motheroccej!=.) & race!=. & south_merge!=.

*-------------------------*
* Samples
*-------------------------*
/*Note: ~1300 more obs are dropped when using the predicted
         parental income measure than when using the actual 
         income measure from Davis and Mazumder (2022) */
        
    * Base sample: both miss0 and miss1 equal zero
    gen MD_sample = (miss0==0 & miss1==0) 

    * Each combined sample adds the requirement that one income measure is non-missing
    gen comb_sample = (MD_sample & (log_father_interpolated_retro!=.))
    gen comb_sample_plusnonwdads = (MD_sample & (all_dads_income_retro!=.))
    gen comb_sample_allparents = (MD_sample & (parent_income_all_retro!=.))

    * Drop construction helpers before writing the analysis file
    drop age10 baseline_sample mothers_samp
    
    save "$Mydirectory2/appendix_d/DM_sample_foranalysis.dta", replace
